Applying models to Nanopore data#

Step 1: Read output from modkit#

Hide code cell source
path = r"../Data/Nanopore_data/Kasumi1-naive-p2solo.bed/Kasumi1-naive-p2solo.bed"
sample_name = 'Kasumi1-naive'

import pandas as pd

# Column names for the two parts
columns_part1 = ["chrom", "start_position", "end_position", "modified_base_code", "score", "strand", "start_position_compat", "end_position_compat", "color"]
columns_part2 = ["N_valid_cov", "fraction_modified", "N_mod", "N_canonical", "N_other_mod", "N_delete", "N_fail", "N_diff", "N_nocall"]

# Read the first part with '\t' delimiter
df_part1 = pd.read_csv(path, sep='\t', header=None, usecols=range(9), names=columns_part1)

# Read the second part with ' ' delimiter
df_part2 = pd.read_csv(path, delim_whitespace=True, header=None, usecols=range(9, 18), names=columns_part2)

# Concatenate the two dataframes horizontally
df = pd.concat([df_part1, df_part2], axis=1)

# Filter the dataframe by modified base code
df_5mc = df[df["modified_base_code"] == "m"]
df_5hmc = df[df["modified_base_code"] == "h"]

df['modified_base_code'].value_counts(dropna=False)
modified_base_code
h    28450341
m    28450341
Name: count, dtype: int64
Hide code cell source
df_5mc['score'].mean()
13.171531265653371

Step 2: Load CpG probe array coordinates for build 38#

Hide code cell source
# Load the list of suboptimal probes
zhou2016_probes = pd.read_csv('../Data/UnreliableProbesList_Zhou2016/EPIC.anno.GRCh38.tsv', sep='\t')

# Add a column with the coordinate of the probe
df_5mc['coordinate'] = df_5mc['chrom'].astype(str) + ':' + df_5mc['start_position'].astype(str)
zhou2016_probes['coordinate'] = zhou2016_probes['chrm'].astype(str) + ':' + zhou2016_probes['start'].astype(str)

# Merge the two dataframes
df_5mc_merged = pd.merge(df_5mc, zhou2016_probes[['probeID','coordinate']], on='coordinate', how='inner')

# Set the index to the probeID
df_5mc_merged = df_5mc_merged.set_index('probeID')

# Create beta values column
df_5mc_merged[sample_name] = (df_5mc_merged['fraction_modified'] / 100).round(3)

# Create a new dataframe with only the beta values
df_nanopore = df_5mc_merged[[sample_name]].T

df_nanopore
/tmp/ipykernel_290483/610380037.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_5mc['coordinate'] = df_5mc['chrom'].astype(str) + ':' + df_5mc['start_position'].astype(str)
probeID cg14817997 cg26928153 cg16269199 cg13869341 cg14008030 cg12045430 cg20826792 cg18231760 cg02404219 cg21870274 ... cg00180869 cg05964935 cg00618696 cg25363292 cg12170560 cg08890132 cg00491786 cg26559055 cg07587934 cg16855331
Kasumi1-naive 1.0 1.0 1.0 0.938 0.941 0.0 0.0 0.5 1.0 1.0 ... 0.867 0.667 0.901 0.92 0.971 0.953 0.714 1.0 0.967 0.734

1 rows × 860482 columns

Step 3: Merge results with train and test array data#

Load array data#

Hide code cell source
import pandas as pd
import numpy as np
from source.pacmap_functions import *

input_path = '../Data/Intermediate_Files/'
output_path = '../Data/Processed_Data/'

# read df_discovery and df_validation
df_discovery = pd.read_pickle(
    input_path+'df_discovery.pkl').sort_index()

# Load clinical data
discovery_clinical_data = pd.read_csv(input_path+'discovery_clinical_data.csv',
                                      low_memory=False, index_col=0)

# Create a new dataframe to contain metadata for `df_nanopore`, which represents sample 'UF_hem_1832_PB'
validation_clinical_data = pd.DataFrame(index=df_nanopore.index)

# Adjust clinical data
discovery_clinical_data['Train Test'] = 'Discovery (train) Samples'
validation_clinical_data['Train Test'] = sample_name

discovery_clinical_data['PaCMAP Output'] = 'Patient Samples'
validation_clinical_data['PaCMAP Output'] = 'Patient Samples'

discovery_clinical_data['Batch'] = df_discovery['Batch']
validation_clinical_data['Batch'] = 'Nanopore'

# add the following columns to `validation_clinical_data`: 'Clinical Trial', 'Sample Type', 'Patient_ID', 'ELN AML 2022 Diagnosis', 'Hematopoietic Lineage'
validation_clinical_data['Clinical Trial'] = 'UF Hem Bank'
validation_clinical_data['Sample Type'] = 'Peripheral Blood'
validation_clinical_data['Patient_ID'] = sample_name
validation_clinical_data['ELN AML 2022 Diagnosis'] = np.nan
validation_clinical_data['Hematopoietic Lineage'] = np.nan

Select CpGs in both train and nanopore sample#

Hide code cell source
# use overlapping features between df_discovery and df_nanopore
common_features = [x for x in df_discovery.columns if x in df_nanopore.columns]

# apply `common_features` to both df_discovery and df_nanopore
df_discovery = df_discovery[common_features]
df_nanopore = df_nanopore[common_features]

print(
f' Discovery dataset (df_discovery) contains {df_discovery.shape[1]} \
columns (5mC nucleotides/probes) and {df_discovery.shape[0]} rows (samples).')

print(
f' Validation dataset (df_nanopore) contains {df_nanopore.shape[1]} \
columns (5mC nucleotides/probes) and {df_nanopore.shape[0]} rows (samples).')

output_notebook()
df_validation = df_nanopore.copy()

# Set the theme for the plot
curdoc().theme = 'light_minimal' # or 'dark_minimal'
 Discovery dataset (df_discovery) contains 332092 columns (5mC nucleotides/probes) and 3330 rows (samples).
 Validation dataset (df_nanopore) contains 332092 columns (5mC nucleotides/probes) and 1 rows (samples).
Loading BokehJS ...
Hide code cell source
clinical_trials = ['AAML0531', 'AAML1031', 'AAML03P1', 'CCG2961', 'Japanese AML05']

sample_types = ['Diagnosis', 'Primary Blood Derived Cancer - Bone Marrow', 'Bone Marrow Normal',
                'Primary Blood Derived Cancer - Peripheral Blood', 'Blood Derived Normal']

cols = ['Clinical Trial', 'Sample Type', 'Patient_ID', 'ELN AML 2022 Diagnosis', 'Train Test', 'Batch']

components = [2]
for n in components:
    processor = DataProcessor(discovery_clinical_data.copy(),
                              df_discovery,
                              clinical_trials,
                              sample_types,
                              cols, 
                              n_components=n,
                              common_prefix=output_path+f'pacmap_output/pacmap_{n}d_model_peds_dx_aml', 
                              df_test=df_validation.copy(),
                              test_clinical_data=validation_clinical_data.copy())
    
    processor.filter_data()
    processor.apply_pacmap() # learn PaCMAP on the training data
    processor.apply_pacmap_test() # apply PaCMAP to the test data
    processor.join_labels() # join clinical data to the embedding

df = processor.df.copy()

#     # Save output
#     processor.df.to_csv(output_path+f'pacmap_output/pacmap_{n}d_model_peds_dx_aml.csv')
The PaCMAP instance is successfully saved at ../Data/Processed_Data/pacmap_output/pacmap_2d_model_peds_dx_aml.pkl.
To load the instance again, please do `pacmap.load(../Data/Processed_Data/pacmap_output/pacmap_2d_model_peds_dx_aml)`.

Plot Sample in AML Map#

Hide code cell source
# Concatenate discovery and validation clinical data
clinical_data = pd.concat([discovery_clinical_data, validation_clinical_data]).loc[df['index']]

# Select columns to plot
cols = ['PaCMAP Output','Hematopoietic Lineage','WHO AML 2022 Diagnosis','ELN AML 2022 Diagnosis', 'FAB', 'FLT3 ITD', 'Age (group years)',
        'Complex Karyotype', 'Primary Cytogenetic Code' ,'Batch', 'Sex', 'MRD 1 Status',
        'Leucocyte counts (10⁹/L)', 'Risk Group', 'Race or ethnic group',
        'Clinical Trial', 'Vital Status','First Event','Sample Type', 'Train Test']

# Join clinical data to the embedding
df = df.join(clinical_data[cols], rsuffix='_copy', on='index')

plotter = BokehPlotter(df, cols, get_custom_color_palette(),
                       title='The Methylome Atlas of Pediatric AML',
                        x_range=(-45, 45), y_range=(-45, 45),
                        datapoint_size=3, tooltip_dx_cols='ELN AML 2022 Diagnosis',
                        width=1000, height=500)
plotter.plot()